{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import codecs\n",
    "import csv\n",
    "import time\n",
    "import os\n",
    "import re\n",
    "import gzip\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH_TO_DATA = 'Data/'\n",
    "PATH_TO_DATA_EN = PATH_TO_DATA+\"enwiki/\"\n",
    "PATH_TO_DATA_UK = PATH_TO_DATA+\"ukwiki/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ENWIKI_ART_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_EN):\n",
    "    if re.match(r\"enwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_art.csv.gz\", file):\n",
    "        ENWIKI_ART_FNMS.append(file)\n",
    "ENWIKI_RED_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_EN):\n",
    "    if re.match(r\"enwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_red.csv.gz\", file):\n",
    "        ENWIKI_RED_FNMS.append(file)        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def unpack(file_name):\n",
    "    file_name_new = file_name.replace(\".gz\",\"\")\n",
    "    with gzip.open(file_name, 'rb') as f_in, open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pack_and_remove(file_name):\n",
    "    file_name_new = file_name+'.gz'\n",
    "    with open(file_name, 'rb') as f_in, gzip.open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    os.remove(file_name)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_en_id_name = None\n",
    "for fn in ENWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_EN+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='ISO-8859-1', quotechar=\"\\\"\")\n",
    "    df_en_id_name_tmp = df_articles[['id', 'title', 'text_len']].drop_duplicates()\n",
    "    df_en_id_name_tmp.columns = ['id', 'title', 'length']\n",
    "    #print(\"df_id_name_tmp size: {}\".format(df_id_name_tmp.shape))\n",
    "    if df_en_id_name is not None:\n",
    "        df_en_id_name = df_en_id_name.append(df_en_id_name_tmp)        \n",
    "        #print(\"append\")\n",
    "    else:\n",
    "        df_en_id_name = df_en_id_name_tmp\n",
    "        #print(\"assign\")\n",
    "    print(\"df_en_id_name size: {}\".format(df_en_id_name.shape))\n",
    "    os.remove(fn_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_en_id_name = df_en_id_name.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_en_id_name.to_csv(PATH_TO_DATA_EN+'enwiki-20180620-id_name.csv', index = False, encoding='ISO-8859-1', quotechar=\"'\", escapechar =\"\\\\\", quoting=csv.QUOTE_NONNUMERIC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_en_id_name = pd.read_csv(PATH_TO_DATA_EN+'enwiki-20180620-id_name.csv', encoding='ISO-8859-1', quotechar=\"'\", escapechar =\"\\\\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12</td>\n",
       "      <td>Anarchism</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>25</td>\n",
       "      <td>Autism</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>39</td>\n",
       "      <td>Albedo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>290</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>303</td>\n",
       "      <td>Alabama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id      title\n",
       "0   12  Anarchism\n",
       "1   25     Autism\n",
       "2   39     Albedo\n",
       "3  290          A\n",
       "4  303    Alabama"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_en_id_name = df_en_id_name[['id', 'title']]\n",
    "df_en_id_name.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_id_orig_title_alias = pd.read_csv(PATH_TO_DATA_EN+'enwiki-20180620-id_orig_title_alias.csv', encoding='ISO-8859-1', quotechar=\"'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_orig</th>\n",
       "      <th>title_alias</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>411964</td>\n",
       "      <td>AccessibleComputing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13813</td>\n",
       "      <td>AfghanistanHistory</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12681</td>\n",
       "      <td>AfghanistanGeography</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>66468</td>\n",
       "      <td>AfghanistanPeople</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6684</td>\n",
       "      <td>AfghanistanCommunications</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id_orig                title_alias\n",
       "0   411964        AccessibleComputing\n",
       "1    13813         AfghanistanHistory\n",
       "2    12681       AfghanistanGeography\n",
       "3    66468          AfghanistanPeople\n",
       "4     6684  AfghanistanCommunications"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_id_orig_title_alias.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13981955, 2)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_id_orig_title_alias.columns = ['id', 'title']\n",
    "df_en_id_name_all = df_en_id_name.append(df_id_orig_title_alias)\n",
    "df_en_id_name_all.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13230029, 2)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_en_id_name_all['title_lc'] = df_en_id_name_all['title'].str.lower()\n",
    "df_en_id_name_all = df_en_id_name_all[['id', 'title_lc']]\n",
    "df_en_id_name_all = df_en_id_name_all.drop_duplicates()\n",
    "df_en_id_name_all.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/enwiki/enwiki-20180620-pages-meta-current01-p10p30303.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current02-p30304p88444.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current03-p88445p200507.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current04-p200511p352689.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current05-p352690p565313.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current06-p565314p892912.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current07-p892914p1268691.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current08-p1268692p1791079.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current09-p1791080p2336422.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current10-p2336425p3046511.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current11-p3046514p3926861.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current12-p3926863p5040436.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current13-p5040438p6197594.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current14-p6197598p7697598.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current14-p7697598p7744799.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current15-p7744803p9244803.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current15-p9244803p9518048.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current16-p11018050p11539266.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current16-p9518050p11018050.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current17-p11539268p13039268.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current17-p13039268p13693071.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current18-p13693074p15193074.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current18-p15193074p16120542.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current19-p16120543p17620543.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current19-p17620543p18754735.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current20-p18754736p20254736.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current20-p20254736p21222156.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current21-p21222158p22722158.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current21-p22722158p23927983.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current22-p23927984p25427984.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current22-p25427984p26823660.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p26823661p28323661.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p28323661p29823661.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current23-p29823661p30503449.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p30503451p32003451.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p32003451p33503451.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current24-p33503451p33952815.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p33952816p35452816.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p35452816p36952816.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current25-p36952816p38067202.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p38067203p39567203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p39567203p41067203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p41067203p42567203.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current26-p42567203p42663461.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p42663462p44163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p44163462p45663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p45663462p47163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p47163462p48663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p48663462p50163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p50163462p51663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p51663462p53163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p53163462p54663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p54663462p56163462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p56163462p57663462.xml_art.csv.gz\n",
      "Data/enwiki/enwiki-20180620-pages-meta-current27-p57663462p57726175.xml_art.csv.gz\n"
     ]
    }
   ],
   "source": [
    "for fn in ENWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_EN+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='ISO-8859-1', quotechar=\"\\\"\")\n",
    "    df_articles['link_val'] = df_articles['link_val'].str.split(\"#\").str[0]\n",
    "    df_articles = df_articles[df_articles['link_val'].notna()]\n",
    "    df_articles = df_articles[df_articles['link_val'].str.strip() != '']\n",
    "    df_articles['link_pos_perc'] = (df_articles['link_pos']/df_articles['text_len']).round(3)       \n",
    "    df_articles['link_val_lc'] = df_articles['link_val'].str.lower()\n",
    "    df_articles = pd.merge(df_articles, df_en_id_name_all, how='left', left_on=['link_val_lc'], right_on=['title_lc'])\n",
    "    df_articles = df_articles[['id_x', 'id_y', 'link_pos', 'link_pos_perc', 'link_val', 'link_txt']]\n",
    "    df_articles.columns = ['id', 'link_id', 'link_pos', 'link_pos_perc', 'link_val', 'link_text']\n",
    "    df_articles[['link_id']] = df_articles[['link_id']].fillna(-1).astype(int)\n",
    "    df_articles[['link_pos']] = df_articles[['link_pos']].fillna(-1).astype(int)\n",
    "    df_articles[['link_pos_perc']] = df_articles[['link_pos_perc']].fillna(-1).astype(int)\n",
    "    df_articles['is_red_link'] = np.where(df_articles['link_id'] == -1, True, False)\n",
    "    df_articles.to_csv(fn_new, index = False, encoding='ISO-8859-1', quotechar=\"'\", \n",
    "                       escapechar =\"\\\\\", quoting=csv.QUOTE_NONNUMERIC)\n",
    "    pack_and_remove(fn_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "UKWIKI_ART_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_UK):\n",
    "    if re.match(r\"ukwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_art.csv.gz\", file):\n",
    "        UKWIKI_ART_FNMS.append(file)\n",
    "UKWIKI_RED_FNMS = []\n",
    "for file in os.listdir(PATH_TO_DATA_UK):\n",
    "    if re.match(r\"ukwiki-20180620-pages-meta-current\\d{2}-p\\d+p\\d+.xml_red.csv.gz\", file):\n",
    "        UKWIKI_RED_FNMS.append(file)   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_uk_id_name = None\n",
    "for fn in UKWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_UK+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='UTF-8', quotechar=\"\\\"\")\n",
    "    df_id_name_tmp = df_articles[['id', 'title', 'text_len']].drop_duplicates()\n",
    "    df_id_name_tmp.columns = ['id', 'title', 'length']\n",
    "    #print(\"df_id_name_tmp size: {}\".format(df_id_name_tmp.shape))\n",
    "    if df_uk_id_name is not None:\n",
    "        df_uk_id_name = df_uk_id_name.append(df_id_name_tmp)        \n",
    "        #print(\"append\")\n",
    "    else:\n",
    "        df_uk_id_name = df_id_name_tmp\n",
    "        #print(\"assign\")\n",
    "    print(\"df_uk_id_name size: {}\".format(df_uk_id_name.shape))\n",
    "    os.remove(fn_new)\n",
    "df_uk_id_name = df_uk_id_name.drop_duplicates()\n",
    "df_uk_id_name.to_csv(PATH_TO_DATA_UK+'ukwiki-20180620-id_name.csv', index = False, encoding='UTF-8', quotechar=\"\\\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>Головна сторінка</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13</td>\n",
       "      <td>Географія</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>584</td>\n",
       "      <td>Атом</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>585</td>\n",
       "      <td>Мільярд</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>586</td>\n",
       "      <td>Ядро</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id             title\n",
       "0    3  Головна сторінка\n",
       "1   13         Географія\n",
       "2  584              Атом\n",
       "3  585           Мільярд\n",
       "4  586              Ядро"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_uk_id_name = pd.read_csv(PATH_TO_DATA_UK+'ukwiki-20180620-id_name.csv',  encoding='UTF-8', quotechar=\"\\\"\")\n",
    "df_uk_id_name = df_uk_id_name[['id', 'title']]\n",
    "df_uk_id_name.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_orig</th>\n",
       "      <th>title_alias</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>610</td>\n",
       "      <td>Esperanto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9701</td>\n",
       "      <td>Wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>HomePage</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>61817</td>\n",
       "      <td>Капустянко Микола</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6083</td>\n",
       "      <td>Володимир Винниченко</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id_orig           title_alias\n",
       "0      610             Esperanto\n",
       "1     9701             Wikipedia\n",
       "2        3              HomePage\n",
       "3    61817     Капустянко Микола\n",
       "4     6083  Володимир Винниченко"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_uk_id_orig_title_alias = pd.read_csv(PATH_TO_DATA_UK+'ukwiki-20180620-id_orig_title_alias.csv',  encoding='UTF-8', quotechar=\"\\\"\")\n",
    "df_uk_id_orig_title_alias.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1239609, 2)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_uk_id_orig_title_alias.columns = ['id', 'title']\n",
    "df_uk_id_name_all = df_uk_id_name.append(df_uk_id_orig_title_alias)\n",
    "df_uk_id_name_all.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1231644, 2)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_uk_id_name_all['title_lc'] = df_uk_id_name_all['title'].str.lower()\n",
    "df_uk_id_name_all = df_uk_id_name_all[['id', 'title_lc']]\n",
    "df_uk_id_name_all = df_uk_id_name_all.drop_duplicates()\n",
    "df_uk_id_name_all.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/ukwiki/ukwiki-20180620-pages-meta-current01-p1p5503930.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current02-p5503931p11007860.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current03-p11007861p16511790.xml_art.csv.gz\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current04-p16511791p22015720.xml_art.csv.gz\n"
     ]
    }
   ],
   "source": [
    "for fn in UKWIKI_ART_FNMS:\n",
    "    fn = PATH_TO_DATA_UK+fn\n",
    "    print(fn)\n",
    "    fn_new = unpack(fn)\n",
    "    df_articles = pd.read_csv(fn_new, encoding='UTF-8', quotechar=\"\\\"\")\n",
    "    df_articles['link_val'] = df_articles['link_val'].str.split(\"#\").str[0]\n",
    "    df_articles = df_articles[df_articles['link_val'].notna()]\n",
    "    df_articles = df_articles[df_articles['link_val'].str.strip() != '']\n",
    "    df_articles['link_pos_perc'] = (df_articles['link_pos']/df_articles['text_len']).round(3)    \n",
    "    df_articles['link_val_lc'] = df_articles['link_val'].str.lower()\n",
    "    df_articles = pd.merge(df_articles, df_uk_id_name_all, how='left', left_on=['link_val_lc'], right_on=['title_lc'])\n",
    "    df_articles = df_articles[['id_x', 'id_y', 'link_pos', 'link_pos_perc', 'link_val', 'link_txt']]\n",
    "    df_articles.columns = ['id', 'link_id', 'link_pos', 'link_pos_perc', 'link_val', 'link_text']\n",
    "    df_articles[['link_id']] = df_articles[['link_id']].fillna(-1).astype(int)\n",
    "    df_articles[['link_pos']] = df_articles[['link_pos']].fillna(-1).astype(int)\n",
    "    df_articles[['link_pos_perc']] = df_articles[['link_pos_perc']].fillna(-1).astype(int)\n",
    "    df_articles['is_red_link'] = np.where(df_articles['link_id'] == -1, True, False)\n",
    "    df_articles.to_csv(fn_new, index = False, encoding='UTF-8', quotechar=\"\\\"\")\n",
    "    pack_and_remove(fn_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_id_name = pd.read_csv(PATH_TO_DATA_EN+'enwiki-20180620-id_name.csv', encoding='ISO-8859-1', quotechar=\"'\", escapechar =\"\\\\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = PATH_TO_DATA_UK+'ukwiki-20180620-langlinks_en.csv.gz'\n",
    "print(fn)\n",
    "fn_new = unpack(fn)\n",
    "df_uk_en_links = pd.read_csv(fn_new, encoding='ISO-8859-1', quotechar=\"'\")\n",
    "df_uk_en_links = pd.merge(df_uk_en_links, df_id_name, how='left', left_on=['ll_title'], right_on=['title'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_uk_en_links = df_uk_en_links[['ll_from', 'id']]\n",
    "df_uk_en_links.columns = ['id_uk', 'id_en']\n",
    "df_uk_en_links[['id_uk']] = df_uk_en_links[['id_uk']].fillna(-1).astype(int)\n",
    "df_uk_en_links[['id_en']] = df_uk_en_links[['id_en']].fillna(-1).astype(int)\n",
    "df_uk_en_links.to_csv(PATH_TO_DATA_UK+'20180620-langlinks_uk_en.csv', index = False, encoding='ISO-8859-1', quotechar=\"'\", \n",
    "                       escapechar =\"\\\\\", quoting=csv.QUOTE_NONNUMERIC)\n",
    "pack_and_remove(fn_new)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
